In [258]:
# Basic libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

# Machine learning related models 
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder,  PowerTransformer, FunctionTransformer, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif

# Statistics
from scipy.stats import ttest_ind, f_oneway

import warnings 
warnings.filterwarnings('ignore')
In [257]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

class EDA_basic:
    def __init__(self, df=None, link=None, category_features=None):
        if df is not None:
            self.df = df
        elif link is not None:
            self.df = pd.read_csv(link)
        else:
            raise ValueError("Either 'df' or 'link' must be provided.")
        
        self.category_features = category_features if category_features is not None else []
        self.df.columns = self.df.columns.str.lower().str.replace(' ', '_')
        
    def basic_statistical_summary(self):
        print(self.df.head(10)) # The first 10 rows 
        print(self.df.info()) # Check the data types of all features 
        print(self.df.isna().any()) # Check if the data has missing values which it doesn't 
        print(self.df.describe()) 
        # This method provides basic statistics about the dataset
    
    def features_dtype(self):
        categorical_features = [feature for feature in self.df.columns if self.df[feature].dtype == 'O']
        # Help me give featues that are cateogrical 
        numerical_features = [feature for feature in self.df.columns if self.df[feature].dtype != 'O']
        # Help me give featuers that are numerical 
        
        # Then, I will return those lists 
        return categorical_features, numerical_features

    
    def skewness_(self):
        _, numerical_features = self.features_dtype()
        # I am using the previous function to get numerical features and print out the degree of skewness 
        print('The degree of skewness for each numerical feature: \n')
        result = {feature: self.df[feature].skew() for feature in numerical_features} # Put the result in a dictionary

        result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True)) # Then sort the result 
        for feature, skewness in result.items():
            print(f'{feature}: {skewness:.2f}')    
        # Check for skewness, whether the distribution is right or left skewed, and the degree of the skewness will be sorted 
    def histogram_plot_numerical(self):
        # Get the numerical features 
        _, numerical_features = self.features_dtype()
        n = len(numerical_features) # Get the number of the numerical features 
        cols = 3 # Specify the number of cols for subplots 
        rows = (n + cols - 1) // cols # Get the rows according to the cols' number
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 5 * rows)) # Specify the subplots 
        axes = axes.flatten()
        
        # This subplots will present the distributions of each numerical features for outliers detections and whether 
        # the data needs to be transformed 
        for i, feature in enumerate(numerical_features):
            # Plot out histogram for each of the features 
            sns.histplot(self.df[feature], bins=30, alpha=0.7, ax=axes[i], label=feature, kde=True)
            axes[i].set_xlabel(feature)
            axes[i].set_ylabel('Frequency')
            axes[i].legend()

        for j in range(i + 1, len(axes)):
            fig.delaxes(axes[j])

        plt.tight_layout()
        plt.show() # Show the plot 

    
    def target_imbalance_plot_check(self, target):
        # As the characteristic of churn rate problems, the target will have imbalanced values 
        ax = self.df[target].value_counts(normalize=True).plot(kind='bar') # Check the percentages of each values for 
        # imbalance detection
        for p in ax.patches:
            ax.annotate(f'{p.get_height():.2%}', (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center', xytext=(0, 5), textcoords='offset points')
        # Adding annotation for better and more informative visualization
        plt.xlabel(target)
        plt.ylabel(f'{target} rate')
        plt.show()
        
    def outliers_detection(self):
        # Using ensemble method which is Isolation forest for outlier detection that takes into account other features 
        
        iso = IsolationForest(contamination=0.05, random_state=42) # Setting 5% of the data points as outliers 
        le = LabelEncoder() # Deal with categorical features 
        categorical_feature, _ = self.features_dtype() # Get the categorical features 
        for feature in categorical_feature:
            self.df[feature] = le.fit_transform(self.df[feature]) # Transform categorical features to numerical features 
        features = self.df.drop('churn', axis=1) # Exclude the target feature 
        iso.fit(features) 
        # Outlier prediction 
        self.df['outliers'] = iso.predict(features) # Create another column for outliers which have the value of -1 if outliers 
        # else 1 
        
        # Count out the number of outliers 
        count_outliers = self.df['outliers'].value_counts()
        print(f'There are {count_outliers[-1]} number of outliers')

        outliers = self.df[self.df['outliers'] == -1] # Get outliers that take the value of -1 
        inliers = self.df[self.df['outliers']==1] # Get inliers - normal data points, takes the value of 1 
        print(outliers)
        return outliers # I only what to return the outliers 

df = pd.read_csv(r"C:\Users\Admin\Downloads\churn-bigml-80.csv")
eda = EDA_basic(df=df)
eda.basic_statistical_summary()
categorical_features, numerical_features = eda.features_dtype()
eda.histogram_plot_numerical()
eda.skewness_()
eda.target_imbalance_plot_check('churn')
outliers = eda.outliers_detection()
  state  account_length  area_code international_plan voice_mail_plan  \
0    KS             128        415                 No             Yes   
1    OH             107        415                 No             Yes   
2    NJ             137        415                 No              No   
3    OH              84        408                Yes              No   
4    OK              75        415                Yes              No   
5    AL             118        510                Yes              No   
6    MA             121        510                 No             Yes   
7    MO             147        415                Yes              No   
8    WV             141        415                Yes             Yes   
9    RI              74        415                 No              No   

   number_vmail_messages  total_day_minutes  total_day_calls  \
0                     25              265.1              110   
1                     26              161.6              123   
2                      0              243.4              114   
3                      0              299.4               71   
4                      0              166.7              113   
5                      0              223.4               98   
6                     24              218.2               88   
7                      0              157.0               79   
8                     37              258.6               84   
9                      0              187.7              127   

   total_day_charge  total_eve_minutes  total_eve_calls  total_eve_charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   
2             41.38              121.2              110             10.30   
3             50.90               61.9               88              5.26   
4             28.34              148.3              122             12.61   
5             37.98              220.6              101             18.75   
6             37.09              348.5              108             29.62   
7             26.69              103.1               94              8.76   
8             43.96              222.0              111             18.87   
9             31.91              163.4              148             13.89   

   total_night_minutes  total_night_calls  total_night_charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   
2                162.6                104                7.32   
3                196.9                 89                8.86   
4                186.9                121                8.41   
5                203.9                118                9.18   
6                212.6                118                9.57   
7                211.8                 96                9.53   
8                326.4                 97               14.69   
9                196.0                 94                8.82   

   total_intl_minutes  total_intl_calls  total_intl_charge  \
0                10.0                 3               2.70   
1                13.7                 3               3.70   
2                12.2                 5               3.29   
3                 6.6                 7               1.78   
4                10.1                 3               2.73   
5                 6.3                 6               1.70   
6                 7.5                 7               2.03   
7                 7.1                 6               1.92   
8                11.2                 5               3.02   
9                 9.1                 5               2.46   

   customer_service_calls  churn  
0                       1  False  
1                       1  False  
2                       0  False  
3                       2  False  
4                       3  False  
5                       0  False  
6                       3  False  
7                       0  False  
8                       0  False  
9                       0  False  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   2666 non-null   object 
 1   account_length          2666 non-null   int64  
 2   area_code               2666 non-null   int64  
 3   international_plan      2666 non-null   object 
 4   voice_mail_plan         2666 non-null   object 
 5   number_vmail_messages   2666 non-null   int64  
 6   total_day_minutes       2666 non-null   float64
 7   total_day_calls         2666 non-null   int64  
 8   total_day_charge        2666 non-null   float64
 9   total_eve_minutes       2666 non-null   float64
 10  total_eve_calls         2666 non-null   int64  
 11  total_eve_charge        2666 non-null   float64
 12  total_night_minutes     2666 non-null   float64
 13  total_night_calls       2666 non-null   int64  
 14  total_night_charge      2666 non-null   float64
 15  total_intl_minutes      2666 non-null   float64
 16  total_intl_calls        2666 non-null   int64  
 17  total_intl_charge       2666 non-null   float64
 18  customer_service_calls  2666 non-null   int64  
 19  churn                   2666 non-null   bool   
dtypes: bool(1), float64(8), int64(8), object(3)
memory usage: 398.5+ KB
None
state                     False
account_length            False
area_code                 False
international_plan        False
voice_mail_plan           False
number_vmail_messages     False
total_day_minutes         False
total_day_calls           False
total_day_charge          False
total_eve_minutes         False
total_eve_calls           False
total_eve_charge          False
total_night_minutes       False
total_night_calls         False
total_night_charge        False
total_intl_minutes        False
total_intl_calls          False
total_intl_charge         False
customer_service_calls    False
churn                     False
dtype: bool
       account_length    area_code  number_vmail_messages  total_day_minutes  \
count     2666.000000  2666.000000            2666.000000         2666.00000   
mean       100.620405   437.438860               8.021755          179.48162   
std         39.563974    42.521018              13.612277           54.21035   
min          1.000000   408.000000               0.000000            0.00000   
25%         73.000000   408.000000               0.000000          143.40000   
50%        100.000000   415.000000               0.000000          179.95000   
75%        127.000000   510.000000              19.000000          215.90000   
max        243.000000   510.000000              50.000000          350.80000   

       total_day_calls  total_day_charge  total_eve_minutes  total_eve_calls  \
count      2666.000000       2666.000000        2666.000000      2666.000000   
mean        100.310203         30.512404         200.386159       100.023631   
std          19.988162          9.215733          50.951515        20.161445   
min           0.000000          0.000000           0.000000         0.000000   
25%          87.000000         24.380000         165.300000        87.000000   
50%         101.000000         30.590000         200.900000       100.000000   
75%         114.000000         36.700000         235.100000       114.000000   
max         160.000000         59.640000         363.700000       170.000000   

       total_eve_charge  total_night_minutes  total_night_calls  \
count       2666.000000          2666.000000        2666.000000   
mean          17.033072           201.168942         100.106152   
std            4.330864            50.780323          19.418459   
min            0.000000            43.700000          33.000000   
25%           14.050000           166.925000          87.000000   
50%           17.080000           201.150000         100.000000   
75%           19.980000           236.475000         113.000000   
max           30.910000           395.000000         166.000000   

       total_night_charge  total_intl_minutes  total_intl_calls  \
count         2666.000000         2666.000000       2666.000000   
mean             9.052689           10.237022          4.467367   
std              2.285120            2.788349          2.456195   
min              1.970000            0.000000          0.000000   
25%              7.512500            8.500000          3.000000   
50%              9.050000           10.200000          4.000000   
75%             10.640000           12.100000          6.000000   
max             17.770000           20.000000         20.000000   

       total_intl_charge  customer_service_calls  
count        2666.000000             2666.000000  
mean            2.764490                1.562641  
std             0.752812                1.311236  
min             0.000000                0.000000  
25%             2.300000                1.000000  
50%             2.750000                1.000000  
75%             3.270000                2.000000  
max             5.400000                9.000000  
No description has been provided for this image
The degree of skewness for each numerical feature: 

churn: 2.01
total_intl_calls: 1.36
number_vmail_messages: 1.27
area_code: 1.11
customer_service_calls: 1.10
account_length: 0.08
total_night_minutes: 0.02
total_night_charge: 0.02
total_night_calls: 0.01
total_eve_charge: -0.01
total_eve_minutes: -0.01
total_day_charge: -0.05
total_day_minutes: -0.05
total_eve_calls: -0.07
total_day_calls: -0.13
total_intl_minutes: -0.22
total_intl_charge: -0.22
No description has been provided for this image
There are 134 number of outliers
      state  account_length  area_code  international_plan  voice_mail_plan  \
8        49             141        415                   1                1   
27       18             172        408                   0                0   
35       20             135        408                   1                1   
83       12              98        510                   0                1   
93       21              36        510                   1                1   
...     ...             ...        ...                 ...              ...   
2581     34             150        415                   0                1   
2587      8              75        510                   0                1   
2597     27              77        408                   1                1   
2598     36             146        510                   0                0   
2631     22             119        510                   1                1   

      number_vmail_messages  total_day_minutes  total_day_calls  \
8                        37              258.6               84   
27                        0              212.0              121   
35                       41              173.1               85   
83                       21              161.2              114   
93                       42              196.8               89   
...                     ...                ...              ...   
2581                     35              139.6               72   
2587                     28              200.6               96   
2597                     44              103.2              117   
2598                      0              138.4              104   
2631                     22              172.1              119   

      total_day_charge  total_eve_minutes  ...  total_eve_charge  \
8                43.96              222.0  ...             18.87   
27               36.04               31.2  ...              2.65   
35               29.43              203.9  ...             17.33   
83               27.40              252.2  ...             21.44   
93               33.46              254.9  ...             21.67   
...                ...                ...  ...               ...   
2581             23.73              332.8  ...             28.29   
2587             34.10              164.1  ...             13.95   
2597             17.54              236.3  ...             20.09   
2598             23.53              158.9  ...             13.51   
2631             29.26              223.6  ...             19.01   

      total_night_minutes  total_night_calls  total_night_charge  \
8                   326.4                 97               14.69   
27                  293.3                 78               13.20   
35                  122.2                 78                5.50   
83                  160.2                 92                7.21   
93                  138.3                126                6.22   
...                   ...                ...                 ...   
2581                213.8                105                9.62   
2587                169.6                153                7.63   
2597                203.5                101                9.16   
2598                 47.4                 73                2.13   
2631                150.0                 94                6.75   

      total_intl_minutes  total_intl_calls  total_intl_charge  \
8                   11.2                 5               3.02   
27                  12.6                10               3.40   
35                  14.6                15               3.94   
83                   4.4                 8               1.19   
93                  20.0                 6               5.40   
...                  ...               ...                ...   
2581                 8.8                 2               2.38   
2587                 2.5                 5               0.68   
2597                11.9                 2               3.21   
2598                 3.9                 9               1.05   
2631                13.9                20               3.75   

      customer_service_calls  churn  outliers  
8                          0  False        -1  
27                         3  False        -1  
35                         0   True        -1  
83                         4  False        -1  
93                         0   True        -1  
...                      ...    ...       ...  
2581                       2  False        -1  
2587                       1  False        -1  
2597                       0   True        -1  
2598                       4   True        -1  
2631                       1   True        -1  

[134 rows x 21 columns]

First I will specifize which featueres are categoircal or numerical and if there are any needed to be converted

In [6]:
df_copy = df.copy()
category_fetures = ['area_code', 'state', 'international_plan', 'voice_mail_plan']
for i in category_fetures:
    df[i]= df[i].astype('category')
In [7]:
numerical_features = [feature for feature in df.columns if feature not in category_fetures and feature !='churn']
numerical_features
Out[7]:
['account_length',
 'number_vmail_messages',
 'total_day_minutes',
 'total_day_calls',
 'total_day_charge',
 'total_eve_minutes',
 'total_eve_calls',
 'total_eve_charge',
 'total_night_minutes',
 'total_night_calls',
 'total_night_charge',
 'total_intl_minutes',
 'total_intl_calls',
 'total_intl_charge',
 'customer_service_calls']

I have converted the 4 features in the categorical_features list above, which previously were objects, to categorical features becuase this helped reduce the memory usage. This can help the data run faster and more precise later on

I just plotted out the numerical data for checking the distributions

  • Most of them are normally distributed which is a good sign
  • There are a few features that don't follow the normal distributions namely:
    • number_vmail_messages: The number of voice mail messages the customer has.
    • customer_service_calls: The number of calls the customer made to customer service.

I believe that these features are inheritely non-normal distribution. Moreover, I can see that in the dataset, the customer don't use voice mail messages much, there are some outliers in this features.

Secondly the number of customer service calls made is frequently one, more than 3 or 4 are not frequently which is undrestandable

I just performed checking the skewnes of the numerical features. The results are quite predictable becuase of the histogram plots of them

  • Three most notable right skewed (positive values) features are:
    • total_intl_calls: Total number of international calls made by the customer
    • number_vmail_messages: The number of voice mail messages the customer has
    • customer_service_calls: The number of calls the customer made to customer service
  • There are four closely normally distributed but slightly right skewed (with positive values) are:
    • account_length: The number of days the customer has had an account with the company
    • total_night_minutes: Total minutes of calls made by the customer during the night
    • total_night_charge: Total charges incurred by the customer for calls made during the night
    • total_night_calls: Total number of calls made by the customer during the night

I believe that these features are not highly skewed which may not need transformations

  • There are eight left-skewed distributions but not severely, therefore, it's not necesarry to transform these features

As expected the churn rate is not blanaced which may required being delt with later on in the process

  1. Churn analysis by state
  • Group the data by the state column and calculate the churn rate for each state to see which states have the highest and lowest churn rates and identify patterns
  1. International plan impact
  • Group the data by the international plan column and calculate aggregate statistics for numerical features to determine if there's a significant difference in usage patterns between customers with and without international plans
  1. Voice mail plan analysis
  • Do the same with international plan
In [129]:
category_features
le = LabelEncoder()
for feature in category_features:
    df_copy[feature] = le.fit_transform(df_copy[feature])
    
plt.figure(figsize=(15, 12))
sns.pairplot(df, hue='churn')
Out[129]:
<seaborn.axisgrid.PairGrid at 0x2c6cf7c2310>
<Figure size 1500x1200 with 0 Axes>
No description has been provided for this image

Outlier detections using Isolation Forest¶

I can see that there are some outliers that need to be removed. I will use Isolation Forest to handle this in the ML building process

Segmentation for churn rate¶

In [259]:
class CustomerSegmentation:
    def __init__(self, data):
        self.df = data.copy()
        self.df_orig_with_clusters = None
        self.df_seg = None
        self.tsne_components = None

    def preprocess_data(self):
        self.df['churn'] = self.df['churn'].astype(int) # Convert churn rate from string to interger 
        self.df.drop(['state', 'area_code'], axis=1, inplace=True) # Removed these two because the results increased 
        
        le = LabelEncoder() # Use labebl encoder to encode categorical features 
        categorical_features = ['international_plan', 'voice_mail_plan']
        for feature in categorical_features: # Loop through the list 
            self.df[feature] = le.fit_transform(self.df[feature])
        
    def detect_outliers(self): # To detect outliers and remove them for improving the clustering results 
        iso = IsolationForest(contamination=0.05, random_state=42) # Use IsolationForest 
        iso.fit(self.df.drop('churn', axis=1)) # Fit the data that excluded out churn rate 
        self.df['outliers'] = iso.predict(self.df.drop('churn', axis=1)) # Create another column 
        # for outliers' record in format 
        # -1 (Outlier) and 1 
        outliers = self.df[self.df['outliers'] == -1].index # Getting the index where the outliers are at 
        self.df.drop(outliers, inplace=True) # Drop the outliers  
        self.df.drop('outliers', axis=1, inplace=True) # Drop the outlier column 
    
    def normalize_features(self):
        # Normalization features for getting all numerical features in the same range 
        numerical_features = self.df.drop(['churn'], axis=1).columns
        scaler = RobustScaler() # Use Robust Scaler because it improved the clustering results 
        self.df[numerical_features] = scaler.fit_transform(self.df[numerical_features])
        # Fit to the numerical data points 
        
    def perform_tsne(self):
        tsne = TSNE(n_components=2, random_state=42)
        self.tsne_components = tsne.fit_transform(self.df.drop(['churn'], axis=1))
        # Perform TSNE to reduce the dimension to 2 
        
    def kmeans_clustering(self, n_clusters=3): # For segmentation, I use Kmeans clustering 
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=15, max_iter=800, init='k-means++')
        # Contruct 3 clusters  and fit to the the TSNE components that previously created and predict 
        # The cluster in the a new column
        self.df['cluster'] = kmeans.fit_predict(self.tsne_components)
        # Clusters evaluation
        silhouette_avg = silhouette_score(self.tsne_components, self.df['cluster']) # To evaluate the results
        calinski_harabasz_idx = calinski_harabasz_score(self.tsne_components, self.df['cluster']) # Alternative to evaluate 
        print(f'Silhouette Score: {silhouette_avg:.2f}')
        print(f'Calinski-Harabasz Index: {calinski_harabasz_idx:.2f}')
    
    def visualize_clusters(self):
        # Plot out the clusters and seperate the data point based on the clusters 
        plt.figure(figsize=(10, 6))
        # Use scatter plot 
        sns.scatterplot(x=self.tsne_components[:, 0], y=self.tsne_components[:, 1], hue=self.df['cluster'], palette='viridis')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.title('t-SNE of K-Means Clusters')
        plt.show()
    
    def analyze_clusters(self):
        self.df_orig_with_clusters = self.df.copy()
        # Get numerical features 
        numerical_columns = self.df.select_dtypes(include=[np.number]).columns
        cluster_summary = self.df.groupby('cluster')[numerical_columns].mean() 
        # Calculate the mean of each clusters for each features 
        print(cluster_summary)
        
        n = len(numerical_columns) - 1  # Exclude 'cluster'
        cols = 3
        rows = (n + cols - 1) // cols
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 5 * rows))
        axes = axes.flatten()

        for i, feature in enumerate(numerical_columns):
            # I loop through numerical features excluding out the cluster column
            if feature != 'cluster':
                sns.boxplot(x='cluster', y=feature, data=self.df, palette='viridis', ax=axes[i])
                axes[i].set_title(f'Distribution of {feature} by Cluster')
        # Delete axes that are not used 
        for j in range(i + 1, len(axes)):
            fig.delaxes(axes[j])

        plt.tight_layout()
        plt.show() # Plot out the clusters 

    def run(self):
        self.preprocess_data()
        self.detect_outliers()
        self.normalize_features()
        self.perform_tsne()
        self.kmeans_clustering()
        self.visualize_clusters()
        self.analyze_clusters()

# Usage
# df is your dataframe
segmentation = CustomerSegmentation(df)
segmentation.run()
Silhouette Score: 0.45
Calinski-Harabasz Index: 3645.02
No description has been provided for this image
         account_length  international_plan  voice_mail_plan  \
cluster                                                        
0              0.033025            0.081212         0.004848   
1              0.036851            0.059375         0.996875   
2             -0.005163            0.093721         0.001874   

         number_vmail_messages  total_day_minutes  total_day_calls  \
cluster                                                              
0                     0.003450          -0.087195        -0.003771   
1                     2.238702           0.031428        -0.051678   
2                     0.001586           0.038955        -0.023500   

         total_day_charge  total_eve_minutes  total_eve_calls  \
cluster                                                         
0               -0.087302          -0.031063        -0.021772   
1                0.031355           0.012908        -0.017127   
2                0.038887          -0.001004         0.015464   

         total_eve_charge  total_night_minutes  total_night_calls  \
cluster                                                             
0               -0.031779             0.051840          -0.017576   
1                0.012167            -0.021700          -0.030228   
2               -0.001744            -0.033302          -0.051691   

         total_night_charge  total_intl_minutes  total_intl_calls  \
cluster                                                             
0                  0.051249           -0.013056          0.109091   
1                 -0.022241           -0.003839          0.148958   
2                 -0.033813            0.044290          0.165573   

         total_intl_charge  customer_service_calls     churn  cluster  
cluster                                                                
0                -0.008356                1.736970  0.187879      0.0  
1                 0.001047                0.457813  0.067187      1.0  
2                 0.049313               -0.309278  0.137769      2.0  
No description has been provided for this image
In [261]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns

class CustomerSegmentation:
    def __init__(self, data):
        self.df = data.copy()
        self.df_orig_with_clusters = None
        self.df_seg = None
        self.tsne_components = None

    def preprocess_data(self):
        self.df['churn'] = self.df['churn'].astype(int)
        self.df.drop(['state', 'area_code'], axis=1, inplace=True)
        
        le = LabelEncoder()
        categorical_features = ['international_plan', 'voice_mail_plan']
        for feature in categorical_features:
            self.df[feature] = le.fit_transform(self.df[feature])
        
    def detect_outliers(self):
        iso = IsolationForest(contamination=0.05, random_state=42)
        iso.fit(self.df.drop('churn', axis=1))
        self.df['outliers'] = iso.predict(self.df.drop('churn', axis=1))
        outliers = self.df[self.df['outliers'] == -1].index
        self.df.drop(outliers, inplace=True)
        self.df.drop('outliers', axis=1, inplace=True)
    
    def normalize_features(self):
        numerical_features = self.df.drop(['churn'], axis=1).columns
        scaler = RobustScaler()
        self.df[numerical_features] = scaler.fit_transform(self.df[numerical_features])
    
    def perform_tsne(self):
        tsne = TSNE(n_components=2, random_state=42)
        self.tsne_components = tsne.fit_transform(self.df.drop(['churn'], axis=1))
    
    def kmeans_clustering(self, n_clusters=3):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=15, max_iter=800, init='k-means++')
        self.df['cluster'] = kmeans.fit_predict(self.tsne_components)
        silhouette_avg = silhouette_score(self.tsne_components, self.df['cluster'])
        calinski_harabasz_idx = calinski_harabasz_score(self.tsne_components, self.df['cluster'])
        print(f'Silhouette Score: {silhouette_avg:.2f}')
        print(f'Calinski-Harabasz Index: {calinski_harabasz_idx:.2f}')
    
    def visualize_clusters(self):
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=self.tsne_components[:, 0], y=self.tsne_components[:, 1], hue=self.df['cluster'], palette='viridis')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')
        plt.title('t-SNE of K-Means Clusters')
        plt.show()
    
    def reverse_dataset(self):
        self.df_orig_with_clusters = self.df.copy()
        self.df_orig_with_clusters.index = self.df.index
        self.df_orig_with_clusters['cluster'] = self.df['cluster'].values

    def analyze_clusters(self):
        numerical_columns = self.df_orig_with_clusters.select_dtypes(include=[np.number]).columns
        cluster_summary = self.df_orig_with_clusters.groupby('cluster')[numerical_columns].mean()
        print(cluster_summary)
        
        n = len(numerical_columns) - 1  # Exclude 'cluster'
        cols = 3
        rows = (n + cols - 1) // cols
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 5 * rows))
        axes = axes.flatten()

        for i, feature in enumerate(numerical_columns):
            if feature != 'cluster':
                sns.boxplot(x='cluster', y=feature, data=self.df_orig_with_clusters, palette='viridis', ax=axes[i])
                axes[i].set_title(f'Distribution of {feature} by Cluster')

        for j in range(i + 1, len(axes)):
            fig.delaxes(axes[j])

        plt.tight_layout()
        plt.show()

    def run(self):
        self.preprocess_data()
        self.detect_outliers()
        self.normalize_features()
        self.perform_tsne()
        self.kmeans_clustering()
        self.visualize_clusters()
        
        # Reverse the dataset to add clusters to original data
        self.df_orig_with_clusters = self.df.copy()
        self.reverse_dataset()
        
        self.analyze_clusters()

# Usage
# df is your dataframe
segmentation = CustomerSegmentation(df)
segmentation.run()
Silhouette Score: 0.45
Calinski-Harabasz Index: 3645.02
No description has been provided for this image
         account_length  international_plan  voice_mail_plan  \
cluster                                                        
0              0.033025            0.081212         0.004848   
1              0.036851            0.059375         0.996875   
2             -0.005163            0.093721         0.001874   

         number_vmail_messages  total_day_minutes  total_day_calls  \
cluster                                                              
0                     0.003450          -0.087195        -0.003771   
1                     2.238702           0.031428        -0.051678   
2                     0.001586           0.038955        -0.023500   

         total_day_charge  total_eve_minutes  total_eve_calls  \
cluster                                                         
0               -0.087302          -0.031063        -0.021772   
1                0.031355           0.012908        -0.017127   
2                0.038887          -0.001004         0.015464   

         total_eve_charge  total_night_minutes  total_night_calls  \
cluster                                                             
0               -0.031779             0.051840          -0.017576   
1                0.012167            -0.021700          -0.030228   
2               -0.001744            -0.033302          -0.051691   

         total_night_charge  total_intl_minutes  total_intl_calls  \
cluster                                                             
0                  0.051249           -0.013056          0.109091   
1                 -0.022241           -0.003839          0.148958   
2                 -0.033813            0.044290          0.165573   

         total_intl_charge  customer_service_calls     churn  cluster  
cluster                                                                
0                -0.008356                1.736970  0.187879      0.0  
1                 0.001047                0.457813  0.067187      1.0  
2                 0.049313               -0.309278  0.137769      2.0  
No description has been provided for this image

Do some statistical tests to see the difference or association between the features and the churn rate¶

In [125]:
def t_test(df, continuous_feature, target='churn'):
    churn_yes = df[df[target] == 'True'][continuous_feature]
    churn_no  = df[df[target] == 'False'][continuous_feature]
    t_stat, p_val = ttest_ind(churn_yes, churn_no)
    
    return t_stat, p_val 

for feature in numerical_features:
    t_stat, p_val = t_test(df_copy, feature)
    if p_val < 0.05:
        print(f'Ttest: p_value: {p_val}')
        print(f'There is a significant difference between {feature} for churn')
    else:
        print('There is no significance')
            
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance
There is no significance

Build machine learning models¶

Next i will perform data engineering data selection then try stacking methods and other ensemble method for perfomrance enhancement

I also want to do Isolation Forest to remove outliers

I also want to perform segmentation on the data to see the demographic elements that effect the churn rate, which group tends to have high churn rate and improve in that segments

And also I will think of more EDA steps using statistical tests

Moreover i need to understand and change the OOP in order for the functions to be more coherent.

I will perform kbest to find the importance feature and train another dataset on those features. I will also perfrom evaluation for feature importnace using a tree-based models

In [36]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
In [43]:
le = LabelEncoder()
df_copy_3 = df.copy()
for feature in category_fetures:
    df_copy_3[feature] = le.fit_transform(df_copy_3[feature])
X = df_copy_3.drop('churn', axis=1)
y = df_copy_3['churn']

selector = SelectKBest(score_func=chi2, k='all')
X_new= selector.fit_transform(X, y)

scores = selector.scores_
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': scores})
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

plt.figure(figsize=(12, 10))
plt.barh(feature_scores['Feature'], feature_scores['Score'], color='skyblue')
plt.show()


model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)
importances = model.feature_importances_
feature_importance = pd.DataFrame({'Feature':X.columns, 'Score':importances})
feature_importance = feature_importance.sort_values(by='Score', ascending=False)

plt.figure(figsize=(12, 10))
plt.barh(feature_importance['Feature'], feature_importance['Score'], color='skyblue')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [44]:
feature_importance
Out[44]:
Feature Score
6 total_day_minutes 0.132546
8 total_day_charge 0.126868
18 customer_service_calls 0.119861
3 international_plan 0.088824
11 total_eve_charge 0.064009
9 total_eve_minutes 0.061374
16 total_intl_calls 0.048108
17 total_intl_charge 0.043624
15 total_intl_minutes 0.042776
14 total_night_charge 0.038501
12 total_night_minutes 0.036655
7 total_day_calls 0.032440
1 account_length 0.031766
13 total_night_calls 0.030783
10 total_eve_calls 0.028500
0 state 0.025838
5 number_vmail_messages 0.022368
4 voice_mail_plan 0.017017
2 area_code 0.008142

I can see that there are some features that are not relevant and redundant to the target. Therefore I will consider removing them using KBest

In [144]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, LabelEncoder, PowerTransformer, FunctionTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

class ChurnPredictionPipeline:
    def __init__(self, category_features=None, scaler=RobustScaler(), sampling_strategy='auto',
                 n_components=None, sampling_technique='smote', k_best=None, contamination=None):
        
        # List of categorical features 
        self.category_features = category_features if category_features is not None else []
        # Robust scaler fucntion for normalizing the features 
        self.scaler = scaler
        # Dictionary for convert categorical features in both training and testing data 
        self.label_encoders = {}
        # Machine learning models 
        self.model = None
        # Sampling strategy for dealing with imbalanced target 
        self.sampling_strategy = sampling_strategy
        self.sampling_technique = sampling_technique
        self.n_components = n_components
        # PCA for dimension reduction 
        self.pca = None if n_components is None else PCA(n_components=n_components)
        # Number of most relevant features 
        self.k_best = k_best
        self.selector = None if k_best is None else SelectKBest(score_func=f_classif, k=k_best)
        # Trasnformer for dealing with skewed data if needed 
        self.transformers = {
            'log': FunctionTransformer(np.log1p, validate=True),
            'sqrt': FunctionTransformer(np.sqrt, validate=True),
            'yeo-johnson': PowerTransformer(method='yeo-johnson')
        }
        self.skew_threshold = 0.5 # The threshold of skewed data that we allow to have 
        self.isolation_forest = None if contamination is None else IsolationForest(contamination=contamination, random_state=42)
        # Isolation forest for outliers detection 
    def preprocess(self, df, is_train=True, skew=False):
        df = df.copy()

        # Handle categorical features
        if self.category_features: # Check if there are categorical features 
            for feature in self.category_features: # Loop through all the features 
                if feature in df.columns: # Check if the features are in the dataframe columns
                    if is_train: # If the features are in the training set 
                        le = LabelEncoder() 
                        df[feature] = le.fit_transform(df[feature]) # Transform categorical data to numerical
                        self.label_encoders[feature] = le  # Reserve the feature'name and encoding method 
                    else:
                        if feature in self.label_encoders: # Check if the feature is in the reserved dictionary 
                            df[feature] = self.label_encoders[feature].transform(df[feature]) 
                            # Encode the data 
                        else:
                            raise ValueError(f"Encoder for column {feature} not found. Ensure training data had this column.")
        
        y = df['churn']
        X = df.drop('churn', axis=1)
        # Outlier detection and removal 
        # Check for outliers in the training set 
        if is_train:
            self.isolation_forest.fit(X)
            X['outliers'] = self.isolation_forest.predict(X)
            outliers = X[X['outliers']==-1].index
            X.drop(outliers, inplace=True)
            y.drop(outliers, inplace=True)
            X.drop('outliers', axis=1, inplace=True)
        
        # Handle skewed features
        if skew:
            if is_train: # Check if it's training set, then use fit_transform
                skewness = X.skew()
                self.skewed_features = skewness[abs(skewness) > self.skew_threshold].index
                for feature in self.skewed_features:
                    X[feature] = self.transformers['yeo-johnson'].fit_transform(X[feature].values.reshape(-1, 1))
            else: # if not training set, then use transform method instead
                for feature in self.skewed_features:
                    X[feature] = self.transformers['yeo-johnson'].transform(X[feature].values.reshape(-1, 1))

        # Scaling
        if is_train: # Fit_transform for training set 
            X = self.scaler.fit_transform(X)
        else: # Transform for testing set 
            X = self.scaler.transform(X)

        # Label encode target variable
        if is_train: # The same for label encoding 
            self.label_encoder_y = LabelEncoder()
            y = self.label_encoder_y.fit_transform(y)
        else:
            y = self.label_encoder_y.transform(y)

        # Feature selection
        if self.selector: # The same for feature selection process 
            if is_train:
                X = self.selector.fit_transform(X, y)
            else:
                X = self.selector.transform(X)
        
        # PCA
        if self.pca: # The same for dimension reduction process
            if is_train:
                X = self.pca.fit_transform(X)
            else:
                X = self.pca.transform(X)
        
        return X, y
    
    def resample(self, X, y): # At first I tried different sampling techniques to see if which one is suitable 
        if self.sampling_technique == 'smote':
            resampler = SMOTE(sampling_strategy=self.sampling_strategy, random_state=42)
        else:
            raise ValueError(f"Unknown sampling technique: {self.sampling_technique}")

        X_res, y_res = resampler.fit_resample(X, y) # I resample the imbalanced data using SMOTE which get the best results
        return X_res, y_res

    def train(self, X, y, model, parameters=None):
        if parameters: # Check if grid-search if needed, if yes, there will be parameters for tuning 
            grid_search = RandomizedSearchCV(model, parameters, cv=3, n_jobs=-1)
            grid_search.fit(X, y)
            self.model = grid_search.best_estimator_
        else: # Else, just simply apply the model
            self.model = model
            self.model.fit(X, y)

    def cross_validate(self, X, y): # Check for cross validation
        if self.model:
            cv_scores = cross_val_score(self.model, X, y, cv=5, n_jobs=-1)
            print('Cross-validation scores:', cv_scores)
            print('Mean cross-validation score:', np.mean(cv_scores)) # Return the mean of all iterrations 

    def evaluate(self, X, y):
        if self.model:
            y_pred = self.model.predict(X) # Predict the churn rate 
            accuracy = accuracy_score(y, y_pred) # Evaluate the accuracy of the churn rate 
            print('Test set accuracy:', accuracy)
            print('Classification report:')
            print(classification_report(y, y_pred)) # Return the reports for the churn rate prediction 
            roc_auc = roc_auc_score(y, self.model.predict_proba(X)[:, 1]) # The return the roc_auc for evaluation
            print('ROC AUC score:', roc_auc)

# Usage Example
if __name__ == "__main__": # Run the code 
    category_features = ['state', 'area_code', 'international_plan', 'voice_mail_plan']
    
    # Use preprocessed data frame with selected features
    df_train = pd.read_csv(r"C:\Users\Admin\Downloads\churn-bigml-80.csv")
    df_train.columns = df_train.columns.str.lower().str.replace(' ', '_')
                           
    df_test = pd.read_csv(r"C:\Users\Admin\Downloads\churn-bigml-20.csv")
    df_test.columns = df_test.columns.str.lower().str.replace(' ', '_')

    # Adjust `k_best` to select the number of top features 
    # Ajust the contamination for the amount of the amount of outliers to be detected, in this case is 5%
    pipeline = ChurnPredictionPipeline(category_features=category_features, n_components=10, k_best=10, contamination=0.05)
    
    # Preprocess the training and testing data 
    X_train, y_train = pipeline.preprocess(df_train, is_train=True)
    X_test, y_test = pipeline.preprocess(df_test, is_train=False)

    # Resample using different techniques
    for sampling_technique in ['smote']:
        print(f"\nUsing sampling technique: {sampling_technique}\n")
        pipeline.sampling_technique = sampling_technique
        X_resampled, y_resampled = pipeline.resample(X_train, y_train)
        
        # Train Random Forest with extended hyperparameters
        rf_params = {
            'n_estimators': [100, 200, 300, 500],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
        pipeline.train(X_resampled, y_resampled, RandomForestClassifier(), rf_params)
        pipeline.cross_validate(X_resampled, y_resampled)
        pipeline.evaluate(X_test, y_test)
        
        # Train XGBoost with extended hyperparameters
        xgb_params = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7, 10],
            'subsample': [0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
        }
        pipeline.train(X_resampled, y_resampled, XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params)
        pipeline.cross_validate(X_resampled, y_resampled)
        pipeline.evaluate(X_test, y_test)
Using sampling technique: smote

Cross-validation scores: [0.94495413 0.96440873 0.96326062 0.96096441 0.9586682 ]
Mean cross-validation score: 0.9584512160439862
Test set accuracy: 0.9460269865067467
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       572
           1       0.85      0.76      0.80        95

    accuracy                           0.95       667
   macro avg       0.90      0.87      0.88       667
weighted avg       0.94      0.95      0.94       667

ROC AUC score: 0.8988038277511963
Cross-validation scores: [0.94151376 0.96555683 0.97129736 0.97244546 0.97014925]
Mean cross-validation score: 0.964192534153509
Test set accuracy: 0.9370314842578711
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       572
           1       0.80      0.75      0.77        95

    accuracy                           0.94       667
   macro avg       0.88      0.86      0.87       667
weighted avg       0.94      0.94      0.94       667

ROC AUC score: 0.8880934854619066
In [ ]: